3. Main questions
Question 1: Pedicted GI50-values
As we mentioned in our presentation, we want to create a model to predict GI50-values thus to predict, if Lapatinib is a good choice., The first linear model trys to predict the G-50 value under the data of the doubling time.
Fold_ChangeLap = select(Fold_Change, contains("Lapa"))
NegLogGI50Lap = NegLogGI50[9,]
means = colMeans(Fold_ChangeLap)
Fold_Changemeans = as.data.frame(t(means))
a2 = gsub(x = colnames (Fold_Changemeans), pattern = "_lapatinib_10000nM_24h", replacement = "")
colnames(Fold_Changemeans) = a2
a3 = gsub(x = a2, pattern = "X7", replacement = "7")
colnames(Fold_Changemeans) = a3
a1 = gsub(x = colnames (NegLogGI50Lap), pattern = "-", replacement = ".")
colnames(NegLogGI50Lap) = a1
c1 = rbind(a1,NegLogGI50Lap)
c2 = rbind(a3,Fold_Changemeans)
c1 = t(c1)
c2 = t(c2)
c1 =as.data.frame(c1)
c2 =as.data.frame(c2)
c3 = subset(c1, `1` %in% intersect(c1$`1`, c2$V1))
c4 = as.numeric(as.character(c3$lapatinib))
adjustedNeglogI50Lap = as.data.frame(c4)
Fold_Changemeans = as.data.frame(t(Fold_Changemeans))
combined1 = cbind(adjustedNeglogI50Lap, Fold_Changemeans)
names1 = c( "NegLogI50Lap","Fold_Changemeans")
colnames(combined1) = names1
lmFold = lm(NegLogI50Lap ~ Fold_Changemeans, data = combined1)
summary(lmFold)
##
## Call:
## lm(formula = NegLogI50Lap ~ Fold_Changemeans, data = combined1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0574 -0.4099 -0.1873 0.2076 2.0682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.464e+00 9.539e-02 57.281 <2e-16 ***
## Fold_Changemeans 1.913e+15 7.519e+14 2.544 0.014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6822 on 52 degrees of freedom
## Multiple R-squared: 0.1107, Adjusted R-squared: 0.09355
## F-statistic: 6.47 on 1 and 52 DF, p-value: 0.01398
qqnorm(lmFold$residuals, main = "Test for normaldistribution of residuals")
qqline(lmFold$residuals)
plot(combined1$NegLogI50Lap, lmFold$fitted.values, pch = 20, col = "blue", xlab = "Real values",
ylab = "Predicted values", main = "Comparison: real and predicted values ~ linear regression (Fold_Changemeans)")
abline(0, 1, col = "red")
cor(combined1$NegLogI50Lap,combined1$Fold_Changemeans)
## [1] 0.3326477
#Split the data (Training - Testing)
n = nrow(combined1)
rmse1 = sqrt(1/n * sum(lmFold$residuals^2))
rmse1
## [1] 0.6694461
i1.train = sample(1:nrow(combined1), 44)
dat1.train = combined1[i1.train, ]
dat1.test = combined1[-i1.train, ]
l1.train = lm(NegLogI50Lap ~ Fold_Changemeans, data = dat1.train)
summary(l1.train)
##
## Call:
## lm(formula = NegLogI50Lap ~ Fold_Changemeans, data = dat1.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0474 -0.3434 -0.1723 0.1671 1.8605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.483e+00 1.010e-01 54.265 <2e-16 ***
## Fold_Changemeans 1.402e+15 7.705e+14 1.819 0.076 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6462 on 42 degrees of freedom
## Multiple R-squared: 0.07305, Adjusted R-squared: 0.05098
## F-statistic: 3.31 on 1 and 42 DF, p-value: 0.07599
n = nrow(dat1.train)
rmse1.train = sqrt(1/n * sum(l1.train$residuals^2))
rmse1.train
## [1] 0.6313146
pred1 = predict(l1.train, newdata = dat1.test)
n = nrow(dat1.test)
residuals = dat1.test$NegLogI50Lap - pred1
rmse1.test1 = sqrt(1/n * sum(residuals^2))
rmse1.test1
## [1] 0.8294357
The second linear model trys to predict the G-50 value under the data of the Foldchange-means.
NegLogGI50Lap = NegLogGI50[9,]
#Sort by Cellline-Name
df = arrange(Cellline_Annotation, Cell_Line_Name)
Doublingtime = cbind.data.frame (df$Cell_Line_Name, df$Doubling_Time)
c21 = as.data.frame(t(NegLogGI50Lap))
combined2 = cbind(c21, Doublingtime$`df$Doubling_Time`)
names2 = c( "NegLogI50Lap","Doubling_Time")
colnames(combined2) = names2
combined2 =na.omit(combined2)
lmDouble = lm(NegLogI50Lap ~ Doubling_Time, data = combined2)
summary(lmDouble)
##
## Call:
## lm(formula = NegLogI50Lap ~ Doubling_Time, data = combined2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2751 -0.4124 -0.1210 0.1709 2.0784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.147415 0.245361 20.979 <2e-16 ***
## Doubling_Time 0.010536 0.006391 1.649 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6741 on 58 degrees of freedom
## Multiple R-squared: 0.04476, Adjusted R-squared: 0.0283
## F-statistic: 2.718 on 1 and 58 DF, p-value: 0.1046
qqnorm(lmDouble$residuals, main = "Test for normaldistribution of residuals")
qqline(lmDouble$residuals)
plot(combined2$NegLogI50Lap, lmDouble$fitted.values, pch = 20, col = "blue", xlab = "Real values",
ylab = "Predicted values", main = "Comparison: real and predicted values ~ linear regression (Doubling-Time)")
abline(0, 1, col = "red")
cor(combined2$NegLogI50Lap,combined2$Doubling_Time)
## [1] 0.2115772
#Split the data (Training - Testing)
n = nrow(combined2)
rmse2 = sqrt(1/n * sum(lmDouble$residuals^2))
rmse2
## [1] 0.6627233
i2.train = sample(1:nrow(combined2), 48)
dat2.train = combined2[i2.train, ]
dat2.test = combined2[-i2.train, ]
l2.train = lm(NegLogI50Lap ~ Doubling_Time, data = dat2.train)
summary(l2.train)
##
## Call:
## lm(formula = NegLogI50Lap ~ Doubling_Time, data = dat2.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0151 -0.3563 -0.1505 0.2048 1.9440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.993835 0.267809 18.647 <2e-16 ***
## Doubling_Time 0.015180 0.007077 2.145 0.0373 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6526 on 46 degrees of freedom
## Multiple R-squared: 0.09092, Adjusted R-squared: 0.07116
## F-statistic: 4.601 on 1 and 46 DF, p-value: 0.03727
n = nrow(dat2.train)
rmse2.train = sqrt(1/n * sum(l2.train$residuals^2))
rmse2.train
## [1] 0.6388209
pred2 = predict(l2.train, newdata = dat2.test)
n = nrow(dat1.test)
residuals = dat2.test$NegLogI50Lap - pred2
rmse2.test = sqrt(1/n * sum(residuals^2))
rmse2.test
## [1] 0.8374992
As a last part, we did a multiple regression with both datasets to predict GI50-values.
b1 = gsub(x =Doublingtime$`df$Cell_Line_Name`, pattern = "-", replacement = ".")
Doublingtime1 = rbind(b1,Doublingtime$`df$Doubling_Time`)
Doublingtime1 = as.data.frame(t(Doublingtime1))
c31 = subset(Doublingtime1, b1 %in% intersect(Doublingtime1$b1, c2$V1))
c41 = as.numeric(as.character(c31$V2))
adjustedDoubling_Time = as.data.frame(c41)
combined3 = cbind(adjustedNeglogI50Lap, Fold_Changemeans, adjustedDoubling_Time)
names3 = c( "NegLogI50Lap","Fold_Changemeans","Doubling_Time")
colnames(combined3) = names3
mlr = lm(NegLogI50Lap ~ ., data = combined3)
summary(mlr)
##
## Call:
## lm(formula = NegLogI50Lap ~ ., data = combined3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3123 -0.3909 -0.1226 0.2003 1.8141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.064e+00 2.655e-01 19.069 <2e-16 ***
## Fold_Changemeans 1.819e+15 7.429e+14 2.449 0.0178 *
## Doubling_Time 1.083e-02 6.717e-03 1.612 0.1130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6719 on 51 degrees of freedom
## Multiple R-squared: 0.1538, Adjusted R-squared: 0.1206
## F-statistic: 4.635 on 2 and 51 DF, p-value: 0.01415
qqnorm(mlr$residuals, main = "Test for normaldistribution of residuals")
qqline(mlr$residuals)
plot(combined3$NegLogI50Lap, mlr$fitted.values, pch = 20, col = "blue", xlab = "Real values",
ylab = "Predicted values" , main = "Comparison: real and predicted values ~ multiple regression")
abline(0, 1, col = "red")
#Split the data (Training - Testing)
n = nrow(combined3)
rmse3 = sqrt(1/n * sum(mlr$residuals^2))
rmse3
## [1] 0.6530072
i3.train = sample(1:nrow(combined2), 44)
dat3.train = combined3[i3.train, ]
dat3.test = combined3[-i3.train, ]
l3.train = lm(NegLogI50Lap ~ ., data = dat3.train)
summary(l3.train)
##
## Call:
## lm(formula = NegLogI50Lap ~ ., data = dat3.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2616 -0.3967 -0.1471 0.1686 1.7311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.139e+00 3.219e-01 15.966 <2e-16 ***
## Fold_Changemeans 1.614e+15 8.737e+14 1.847 0.0725 .
## Doubling_Time 8.956e-03 8.522e-03 1.051 0.2999
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6822 on 38 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.1114, Adjusted R-squared: 0.06463
## F-statistic: 2.382 on 2 and 38 DF, p-value: 0.106
n = nrow(dat3.train)
rmse3.train = sqrt(1/n * sum(l3.train$residuals^2))
rmse3.train
## [1] 0.633939
pred3 = predict(l3.train, newdata = dat3.test)
n = nrow(dat3.test)
residuals = dat3.test$NegLogI50Lap - pred3
rmse3.test = sqrt(1/n * sum(residuals^2))
rmse3.test
## [1] 0.6456633
As you can see from the data, All three regression models are not really good.
Question 2: Erlotinib vs Lapatinib
# correlation in general
n= as.data.frame(t(NegLogGI50))
rmv.rows = apply(n, 1, function(x) {
sum(is.na(x))
})
NLGI50.all = n[-which(rmv.rows > 0), ] # Removing any row with 1 or more missing values
rm(rmv.rows, n, NegLogGI50)
#cor.mat = as.data.frame(cor(NLGI50.all[, 1:ncol(NLGI50.all)], method = "pearson")) #Pearson correlation
#round(cor.mat, 2) #round values
Produce pairwise scatter plots
pairs(NLGI50.all[, 1:ncol(NLGI50.all)], pch = 20, cex = 0.8, col = "royalblue3", main = "Correlation_NegLogGI50")
matrix of correlations
cor = cor(NLGI50.all)
pheatmap(cor, col = cm.colors(256), main = "Pheatmap correlation")
Erlotinib and lapatinib have a strong correlation.
plot erlotinib all genes, coloured by tissue
#differece
diff = data.frame(erlotinib = NLGI50.all$erlotinib - mean(NLGI50.all$erlotinib), lapatinib = NLGI50.all$lapatinib- mean(NLGI50.all$lapatinib))
diff$celllines = rownames(NLGI50.all)
#create vector to insert column tissue from Metadata
tissue = sapply(1:nrow(diff), function(x) {
position = which(as.character(Metadata$cell) == diff[x, "celllines"])[1] #if tissue occurs several times, take the first
out = as.character(Metadata[position, "tissue"]) #output the tissue at this position
return(out)
})
diff$tissue = tissue
rm(tissue)
diff$celllines = factor(diff$celllines, levels = diff$celllines[order(diff$tissue)]) #Classified by tissue
ggplot(diff, aes(x = celllines, y = erlotinib, fill = tissue))+geom_bar(stat = "identity") + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Erlotinib")
The difference from the NegLogGI50 for a particular cell line and the mean NegLogGI50 is plotted here for Erlotinib.
plot lapatinib all genes, coloured by tissue
ggplot(diff, aes(x = celllines, y = lapatinib, fill = tissue)) + geom_bar(stat="identity") + coord_flip() + labs(title="Mean graph plot of NLGI50 values for Lapatinib")
The difference from the NegLogGI50 for a particular cell line and the mean NegLogGI50 is plotted here for Lapatinib.
correlation erlotinib , lapatinib
cor(NLGI50.all$erlotinib, NLGI50.all$lapatinib, method = "pearson")
## [1] 0.6528188
A Pearson correlation coefficient of ~ 0.65 confirms that these patterns are similar.
Lung genes
#only lung with mean all
### load data
Metadata_Lapatinib_treated = Metadata[which(Metadata$drug == "lapatinib" & Metadata$dose != "0nM"),]
NegLogGI50 = as.data.frame(readRDS(paste0(wd, "/Data/NegLogGI50.rds")))
#lung genes from Metadata
Lung_Metadata_L_treated = Metadata[which(Metadata$drug == "lapatinib" & Metadata$dose != "0nM" & Metadata$tissue == "Lung"),]
celllines = Lung_Metadata_L_treated$cell
NegLogGI50.lung = as.data.frame(t(NegLogGI50[c("erlotinib", "lapatinib"), celllines]))
#Difference
dif.NegLogGI50.lung = data.frame(erlotinib = NegLogGI50.lung$erlotinib - mean(NLGI50.all$erlotinib), lapatinib = NegLogGI50.lung$lapatinib - mean(NLGI50.all$lapatinib)) #erlotinib data - mean value, lapatinib data - mean value
dif.NegLogGI50.lung$celllines = rownames(NegLogGI50.lung)
# PLot Erlotinib
ggplot(dif.NegLogGI50.lung,aes(x = celllines, y = erlotinib)) + geom_bar(stat = "identity", fill = "skyblue") + geom_text(aes(label = round(erlotinib, 2)), vjust = -0.5, color = "black", size = 3) + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Erlotinib, only Lung genes")
# Plot Lapatinib
ggplot(dif.NegLogGI50.lung,aes(x = celllines, y = lapatinib)) + geom_bar(stat = "identity", fill = "skyblue") + geom_text(aes(label=round(lapatinib, 2)), vjust = -0.5, color = "black", size = 3) + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Lapatinib, only Lung genes")
#correlation lung
cor(NegLogGI50.lung$erlotinib, NegLogGI50.lung$lapatinib, method = "pearson")
## [1] 0.9609488
A pearson correlation coefficent of ~ 0.96 suggests that Lapatinib has a similar effect on lung cancer as Erlotinib
Anova
selection of Lapatinib and Erlotinib treated cells
lapa<-data.frame(Metadata[which(Metadata[,'drug'] == "lapatinib"), ])
erlo<-data.frame(Metadata[which(Metadata[,'drug'] == "erlotinib"), ])
el<-right_join(lapa,erlo, by="cell")
rmv.rows = apply(el, 1, function(x) {
sum(is.na(x))
}) # Go through each row and sum up all missing values
row.names(rmv.rows)
Create data frame with lapatinib and erlotinib data
fc<-data.frame(Treated-Untreated)
dim(fc)
## [1] 13299 819
all<-data.frame(fc[grep("lapatinib|erlotinib", colnames(fc))])
dim(all)
## [1] 13299 113
since erlotinip contains more columns than lapatinib, we have to remove these columns
all.rmv<-data.frame(all %>% select(
-"CCRF.CEM_erlotinib_10000nM_24h",
-"HL.60_erlotinib_10000nM_24h",
-"HT29_erlotinib_10000nM_24h",
-"K.562_erlotinib_10000nM_24h",
-"LOX_erlotinib_10000nM_24h",
-"SR_erlotinib_10000nM_24h",
-"COLO.205_lapatinib_10000nM_24h"))
dim(all.rmv)
## [1] 13299 106
Checking the rows
la<-data.frame(all.rmv[grep("lapatinib", colnames(all.rmv))])
ncol(la)
## [1] 53
er<-data.frame(all.rmv[grep("erlotinib", colnames(all.rmv))])
ncol(er)
## [1] 53
erla<-data.frame(er,la)
ncol(all.rmv) #to prove if the columns are removed
## [1] 106
Anova
drug<-c(rep('Erlotinib',53), rep('Lapatinib',53))
expression_drug<-apply(erla, MARGIN = 2, sum)
df_drug<-data.frame(expression_drug, drug)
library(ggpubr)
ggboxplot (data = df_drug, x="drug", y="expression_drug", color = "drug",
add = "jitter", legend = "none")+
rotate_x_text(angle = 45)+
stat_compare_means(method = "anova")+ # Add global annova p-value
stat_compare_means(label = "p.signif", method = "t.test",
ref.group = ".all.", hide.ns = TRUE) # Pairwise comparison against all
Question 3: Comparing lapatinib treated breast and cns celllines
L_fc <- select(Fold_Change, contains("Lapa"))
L_fc <- as.data.frame(t(L_fc))
rownames(Metadata) <- Metadata$sample
L_treated <- select(Treated, contains("Lapa"))
L_treated <- t(L_treated)
L_untreated <- select(Untreated, contains("Lapa"))
L_untreated <- t(L_untreated)
# selecting breast Lapatinib samples
breast <- Metadata[Metadata[,'tissue']=="Breast",]
rownames(breast) <- breast$sample
rownames(breast) <- gsub(x = rownames(breast), pattern = "-", replacement = ".")
breastFC <- subset(L_fc, rownames(L_fc) %in% rownames(breast))
breastTreated <- subset(L_treated, rownames(L_treated) %in% rownames(breast))
breastUntreated <- subset(L_untreated, rownames(L_untreated) %in% rownames(breast))#
# selecting CNS Lapatinib samples
cns <- Metadata[Metadata[,'tissue']=="CNS",]
rownames(cns) <- cns$sample
rownames(cns) <- gsub(x = rownames(cns), pattern = "-", replacement = ".")
cnsFC <- subset(L_fc, rownames(L_fc) %in% rownames(cns))
cnsTreated <- subset(L_treated, rownames(L_treated) %in% rownames(cns))
cnsUntreated <- subset(L_untreated, rownames(L_untreated) %in% rownames(cns))
#performing a paired t-test of treated and untreated samples
t_test_cns <- col_t_paired(cnsTreated, cnsUntreated, alternative = "two.sided", mu = 0,conf.level = 0.95)
t_test_breast <- col_t_paired(breastTreated, breastUntreated, alternative = "two.sided", mu = 0,conf.level = 0.95)
#obtaining Benjamini-Hochberg adjusted p-values
pval_cns <- t_test_cns$pvalue
pval_breast <- t_test_breast$pvalue
fdr_cns <- p.adjust(pval_cns, "BH")
fdr_breast <- p.adjust(pval_breast, "BH")
#obtaining mean FC values over all samples
breastFCm <- as.numeric(colMeans(breastFC))
cnsFCm <- as.numeric(colMeans(cnsFC))
genes <- colnames(breastFC)
## breast volcano plot
#creating a matrix containg all needed values for plotting
diff_df_breast <- data.frame(gene = genes, Fold = breastFCm, FDR = fdr_breast)
diff_df_breast$absFold <- abs(diff_df_breast$Fold)
head(diff_df_breast)
## gene Fold FDR absFold
## 1 A1CF 0.037268413 0.8765540 0.037268413
## 2 A2M -0.032213825 0.7188608 0.032213825
## 3 A4GALT 0.006012452 0.9793436 0.006012452
## 4 A4GNT -0.053969518 0.4235638 0.053969518
## 5 AAAS 0.081656784 0.5283372 0.081656784
## 6 AACS 0.023767096 0.8115022 0.023767096
# add a grouping column; default value is "not significant"
diff_df_breast$group <- "NotSignificant"
# change the grouping for the entries with significance but not a large enough Fold change
diff_df_breast[which(diff_df_breast['FDR'] < 0.5 & (diff_df_breast['absFold']) < 0.2 ),"group"] <- "Significant"
# change the grouping for the entries a large enough Fold change but not a low enough p value
diff_df_breast[which(diff_df_breast['FDR'] > 0.5 & (diff_df_breast['absFold']) > 0.2 ),"group"] <- "FoldChange"
# change the grouping for the entries with both significance and large enough fold change
diff_df_breast[which(diff_df_breast['FDR'] < 0.5 & (diff_df_breast['absFold']) > 0.2 ),"group"] <- "Significant&FoldChange"
# Find and label the top peaks.
top_peaks_breast <- diff_df_breast[with(diff_df_breast, order(Fold, FDR)),][1:10,]
top_peaks_breast <- rbind(top_peaks_breast, diff_df_breast[with(diff_df_breast, order(-Fold, FDR)),][1:10,])
# Add gene labels for all of the top genes we found
# creating an empty list, and filling it with entries for each row in the dataframe
# each list entry is another list with named items that will be used
a <- list()
for (i in seq_len(nrow(top_peaks_breast))) {
m <- top_peaks_breast[i, ]
a[[i]] <- list(
x = m[["Fold"]],
y = -log10(m[["FDR"]]),
text = m[["gene"]],
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 0.5,
ax = 20,
ay = -40
)
}
plot_breast <- plot_ly(data = diff_df_breast, x = diff_df_breast$Fold, y = -log10(diff_df_breast$FDR), type = "scatter", text = diff_df_breast$gene, mode = "markers", color = diff_df_breast$group) %>%
layout(title ="Volcano Plot of Lapatinib breast cancer samples",
xaxis = list(title="log2 Fold Change"),
yaxis = list(title="FDR")) %>%
layout(annotations = a)
plot_breast
###thresholds still need to be discussed
## CNS volcano plot
diff_df_cns <- data.frame(gene = genes, Fold = cnsFCm, FDR = fdr_cns)
diff_df_cns$absFold <- abs(diff_df_cns$Fold)
head(diff_df_cns)
## gene Fold FDR absFold
## 1 A1CF 0.066575311 0.5939566 0.066575311
## 2 A2M 0.038348381 0.6873009 0.038348381
## 3 A4GALT 0.000390011 0.9980719 0.000390011
## 4 A4GNT -0.018219799 0.8780106 0.018219799
## 5 AAAS 0.014723327 0.9008420 0.014723327
## 6 AACS 0.003887384 0.9870209 0.003887384
# add a grouping column; default value is "not significant"
diff_df_cns$group <- "NotSignificant"
# change the grouping for the entries with significance but not a large enough Fold change
diff_df_cns[which(diff_df_cns['FDR'] < 0.5 & (diff_df_cns['absFold']) < 0.2 ),"group"] <- "Significant"
# change the grouping for the entries a large enough Fold change but not a low enough p value
diff_df_cns[which(diff_df_cns['FDR'] > 0.5 & (diff_df_cns['absFold']) > 0.2 ),"group"] <- "FoldChange"
# change the grouping for the entries with both significance and large enough fold change
diff_df_cns[which(diff_df_cns['FDR'] < 0.5 & (diff_df_cns['absFold']) > 0.2 ),"group"] <- "Significant&FoldChange"
# Find and label the top peaks..
top_peaks_cns <- diff_df_cns[with(diff_df_cns, order(Fold, FDR)),][1:10,]
top_peaks_cns <- rbind(top_peaks_cns, diff_df_cns[with(diff_df_cns, order(-Fold, FDR)),][1:10,])
a <- list()
for (i in seq_len(nrow(top_peaks_cns))) {
m <- top_peaks_cns[i, ]
a[[i]] <- list(
x = m[["Fold"]],
y = -log10(m[["FDR"]]),
text = m[["gene"]],
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 0.5,
ax = 20,
ay = -40
)
}
plot_cns <- plot_ly(data = diff_df_cns, x = diff_df_cns$Fold, y = -log10(diff_df_cns$FDR),type = "scatter", text = diff_df_cns$gene, mode = "markers", color = diff_df_cns$group) %>%
layout(title ="Volcano Plot of Lapatinib CNS cancer samples",
xaxis = list(title="log2 Fold Change"),
yaxis = list(title="FDR"))%>%
layout(annotations = a)
plot_cns
# selecet top peak genes common in cns and breast tissue
tpb_comparison <- subset(top_peaks_breast, gene %in% top_peaks_cns$gene)
tpc_comparison <- subset(top_peaks_cns, gene %in% top_peaks_breast$gene)
# order common genes alphabetically
tpb_comparison <- tpb_comparison[order(tpb_comparison$gene),]
tpc_comparison <- tpc_comparison[order(tpc_comparison$gene),]
## creating heat map of FCs to compare values
cor_mat <- cbind("breast" = tpb_comparison$Fold, "cns" = tpc_comparison$Fold)
rownames(cor_mat) <- tpb_comparison$gene
data <- read.delim
pheatmap(
mat = cor_mat,
color = magma(10),
border_color = "black",
show_colnames = TRUE,
show_rownames = TRUE,
drop_levels = TRUE,
fontsize = 14,
main = "Comparison:
FC levels of cns and breast top peak genes"
)
#correlation between top peak gene expression patterns in breast and cns tissues treated by lapatinib
cor_mat <- as.data.frame(cor_mat)
dif.FC.BC = data.frame(breast = cor_mat$breast - mean(t(breastFC)), cns = cor_mat$cns - mean(t(cnsFC))) #breast data - mean value, cns data - mean value
dif.FC.BC$gene = rownames(cor_mat)
# PLot
ggplot(dif.FC.BC,aes(x = gene, y = breast)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = round(breast, 2)), vjust = -0.5, color = "black", size = 3) +
coord_flip() + labs(title = "Mean graph plot of Fold Change for breast top peak genes")
ggplot(dif.FC.BC,aes(x = gene, y = cns)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = round(cns, 2)), vjust = -0.5, color = "black", size = 3) +
coord_flip() + labs(title = "Mean graph plot of Fold Change for CNS top peak genes")
#correlation
cor(cor_mat$breast, cor_mat$cns, method = "pearson")
## [1] 0.921812